# load packages
library(readr)
library(dplyr)
library(ggplot2)
library(gridExtra)
library(corrplot)
library(plotly)
library(reshape2)
library(car)
library(kableExtra)
library(broom)
library(knitr)
library(pROC)
library(ggpattern)
library(tidyr)

Setting up Data

# import dataset
female <- read_csv("/Users/hollyduong/Desktop/DA 401/ChildMarriageInVietnam/Data/wm.csv")
# get a glimpse of dataset
head(female)
## # A tibble: 6 × 458
##     HH1   HH2    LN   WM1   WM2   WM3 WMINT   WM4   WM5  WM6D  WM6M  WM6Y   WM8
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     1     2     3     1     2     3    92    91    92    19    11  2020     2
## 2     1     4     2     1     4     2    92    91    92    18    11  2020     1
## 3     1     9     4     1     9     4    92    91    92    18    11  2020     1
## 4     1    10     4     1    10     4    92    91    92    18    11  2020     2
## 5     1    11     4     1    11     4    92    91    92    19    11  2020     2
## 6     1    11     5     1    11     5    92    91    92    18    11  2020     2
## # ℹ 445 more variables: WM9 <dbl>, WM17 <dbl>, WM7H <dbl>, WM7M <dbl>,
## #   WM10H <dbl>, WM10M <dbl>, WM11 <dbl>, WM12 <dbl>, WM13 <dbl>, WM14 <dbl>,
## #   WM15 <dbl>, WM22 <dbl>, WM23 <dbl>, WM24 <dbl>, WMHINT <dbl>, WMFIN <dbl>,
## #   WB3M <dbl>, WB3Y <dbl>, WB4 <dbl>, WB5 <dbl>, WB6A <dbl>, WB6B <dbl>,
## #   WB7 <dbl>, WB9 <lgl>, WB10A <lgl>, WB10B <lgl>, WB11 <lgl>, WB12A <lgl>,
## #   WB12B <lgl>, WB14 <dbl>, WB15 <dbl>, WB16 <dbl>, WB17 <dbl>, WB18 <dbl>,
## #   WB19A <chr>, WB19B <chr>, WB19C <chr>, WB19D <chr>, WB19E <chr>, …
# Select the specified columns to create a new dataframe
female_df <- select(female, WAGEM, MSTATUS, HH6, HH7, welevel, insurance, ethnicity, windex5, CP2, HA1, MT4, MT9, MT11)

# View the first few rows of the new dataframe
summary(female_df)
##      WAGEM          MSTATUS           HH6             HH7           welevel    
##  Min.   :10.00   Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :0.00  
##  1st Qu.:18.00   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:2.000   1st Qu.:1.00  
##  Median :20.00   Median :1.000   Median :2.000   Median :3.000   Median :2.00  
##  Mean   :20.92   Mean   :1.413   Mean   :1.683   Mean   :3.404   Mean   :2.46  
##  3rd Qu.:23.00   3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:5.000   3rd Qu.:3.00  
##  Max.   :47.00   Max.   :9.000   Max.   :2.000   Max.   :6.000   Max.   :9.00  
##  NA's   :2026    NA's   :11      NA's   :11      NA's   :11      NA's   :524   
##    insurance       ethnicity        windex5           CP2       
##  Min.   :1.000   Min.   :1.000   Min.   :0.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :1.000   Median :1.000   Median :2.000   Median :1.000  
##  Mean   :1.135   Mean   :2.035   Mean   :2.494   Mean   :1.431  
##  3rd Qu.:1.000   3rd Qu.:3.000   3rd Qu.:4.000   3rd Qu.:2.000  
##  Max.   :9.000   Max.   :6.000   Max.   :5.000   Max.   :9.000  
##  NA's   :524                                     NA's   :864    
##       HA1             MT4             MT9             MT11     
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.00  
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.00  
##  Median :1.000   Median :2.000   Median :1.000   Median :1.00  
##  Mean   :1.215   Mean   :1.664   Mean   :1.365   Mean   :1.12  
##  3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:1.00  
##  Max.   :9.000   Max.   :9.000   Max.   :9.000   Max.   :9.00  
##  NA's   :524     NA's   :524     NA's   :2496    NA's   :524
# Rename the columns
female_df <- female_df %>%
  rename(
  age_first_marriage = WAGEM,
  marital_status = MSTATUS,
  area = HH6,
  region = HH7,
  education_level = welevel,
  health_insurance = insurance,
  ethnicity = ethnicity,
  wealth_index = windex5,
#  contraceptive_decision_maker = CP5, #R
  current_contraceptive_use = CP2,
# age_first_sexual_intercourse = SB1, #R
#  currently_married = MA1,
#  partner_age = MA2, #R
  awareness_hiv_aids = HA1,
  used_computer_tablet = MT4,
  used_internet = MT9,
  owns_mobile_phone = MT11
)
# Summarize missing values by column
summarize_missing <- sapply(female_df, function(x) sum(is.na(x)))
print(summarize_missing)
##        age_first_marriage            marital_status                      area 
##                      2026                        11                        11 
##                    region           education_level          health_insurance 
##                        11                       524                       524 
##                 ethnicity              wealth_index current_contraceptive_use 
##                         0                         0                       864 
##        awareness_hiv_aids      used_computer_tablet             used_internet 
##                       524                       524                      2496 
##         owns_mobile_phone 
##                       524
# Recode the value 9 to NA for specified variables
female_df <- female_df %>%
  mutate(
    current_contraceptive_use = na_if(current_contraceptive_use, 9),
    used_internet = na_if(used_internet, 9),
    health_insurance = na_if(health_insurance, 9),
    education_level = na_if(education_level, 9),
    awareness_hiv_aids = na_if(awareness_hiv_aids, 9),
    used_computer_tablet = na_if(used_computer_tablet, 9),
    owns_mobile_phone = na_if(owns_mobile_phone, 9),
    marital_status = na_if(marital_status, 9)
  )

# Recode the value 6 to NA for 'ethnicity'
female_df$ethnicity <- na_if(female_df$ethnicity, 6)

# Recode the value 0 to NA for 'wealth_index'
female_df$wealth_index <- na_if(female_df$wealth_index, 0)
# Recoding variables from 1 (Yes) and 2 (No) to 1 (Yes) and 0 (No)
female_df$health_insurance <- ifelse(female_df$health_insurance == 2, 0, female_df$health_insurance)
female_df$current_contraceptive_use <- ifelse(female_df$current_contraceptive_use == 2, 0, female_df$current_contraceptive_use)
female_df$awareness_hiv_aids <- ifelse(female_df$awareness_hiv_aids == 2, 0, female_df$awareness_hiv_aids)
female_df$used_computer_tablet <- ifelse(female_df$used_computer_tablet == 2, 0, female_df$used_computer_tablet)
female_df$owns_mobile_phone <- ifelse(female_df$owns_mobile_phone == 2, 0, female_df$owns_mobile_phone)
female_df$used_internet <- ifelse(female_df$used_internet == 2, 0, female_df$used_internet)
# View the changes to ensure NA substitution has been correctly applied
summary(female_df)
##  age_first_marriage marital_status       area           region     
##  Min.   :10.00      Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:18.00      1st Qu.:1.000   1st Qu.:1.000   1st Qu.:2.000  
##  Median :20.00      Median :1.000   Median :2.000   Median :3.000  
##  Mean   :20.92      Mean   :1.408   Mean   :1.683   Mean   :3.404  
##  3rd Qu.:23.00      3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:5.000  
##  Max.   :47.00      Max.   :3.000   Max.   :2.000   Max.   :6.000  
##  NA's   :2026       NA's   :18      NA's   :11      NA's   :11     
##  education_level health_insurance   ethnicity      wealth_index  
##  Min.   :0.00    Min.   :0.0000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.00    1st Qu.:1.0000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :2.00    Median :1.0000   Median :1.000   Median :2.000  
##  Mean   :2.46    Mean   :0.8659   Mean   :1.586   Mean   :2.615  
##  3rd Qu.:3.00    3rd Qu.:1.0000   3rd Qu.:2.000   3rd Qu.:4.000  
##  Max.   :5.00    Max.   :1.0000   Max.   :4.000   Max.   :5.000  
##  NA's   :525     NA's   :525      NA's   :1148    NA's   :524    
##  current_contraceptive_use awareness_hiv_aids used_computer_tablet
##  Min.   :0.0000            Min.   :0.0000     Min.   :0.0000      
##  1st Qu.:0.0000            1st Qu.:1.0000     1st Qu.:0.0000      
##  Median :1.0000            Median :1.0000     Median :0.0000      
##  Mean   :0.5842            Mean   :0.7975     Mean   :0.3412      
##  3rd Qu.:1.0000            3rd Qu.:1.0000     3rd Qu.:1.0000      
##  Max.   :1.0000            Max.   :1.0000     Max.   :1.0000      
##  NA's   :885               NA's   :541        NA's   :532         
##  used_internet    owns_mobile_phone
##  Min.   :0.0000   Min.   :0.0000   
##  1st Qu.:0.0000   1st Qu.:1.0000   
##  Median :1.0000   Median :1.0000   
##  Mean   :0.6394   Mean   :0.8831   
##  3rd Qu.:1.0000   3rd Qu.:1.0000   
##  Max.   :1.0000   Max.   :1.0000   
##  NA's   :2501     NA's   :529

Creating New Variable Access to Media

# Combine individual variables for access to the internet, phone, and computer into a single variable
# This new variable "access_to_media" will have a value of 1 if the respondent has access to any of these media sources, and 0 if not
# This provides a more comprehensive measure of media access
female_df <- female_df %>%
  mutate(access_to_media = ifelse(used_computer_tablet == 1 | used_internet == 1 | owns_mobile_phone == 1, 1, 0))

# In later analysis, use access_to_media instead of the 3 separate variables
# Exporting female_df to a CSV file in the current working directory
#write.csv(female_df, "female_df.csv", row.names = FALSE)

Distributions of Data

# Histogram for 'Age at First Marriage'
afm_hist <- ggplot(female_df, aes(x = age_first_marriage)) +
  geom_histogram(fill = "cyan", color = "black") +
  theme_minimal() +
  ggtitle("Histogram of Age at First Marriage") +
  xlab("Age at First Marriage") +
  ylab("Frequency")

print(afm_hist)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2026 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Saving the histogram
#ggsave("histogram_age_first_marriage.png", plot = afm_hist, width = 8, height = 6, dpi = 300)
# # Plotting a boxplot for 'Age at First Marriage'
# ggplot(female_df, aes(y = age_first_marriage)) +
#   geom_boxplot(fill = "cyan") +
#   labs(title = "Boxplot of Age at First Marriage", y = "Age at First Marriage") +
#   theme_minimal()
# # Bar plot for 'used_internet'
# ggplot(female_df, aes(x = used_internet)) +
#   geom_bar(fill = "yellow") +
#   theme_minimal() +
#   ggtitle("Bar Plot of Used Internet") +
#   xlab("Used Internet") +
#   ylab("Count")

Handling Missing Data

# Before that, let's double check the count of missing values by column
count_missing <- sapply(female_df, function(x) sum(is.na(x)))
print(count_missing)
##        age_first_marriage            marital_status                      area 
##                      2026                        18                        11 
##                    region           education_level          health_insurance 
##                        11                       525                       525 
##                 ethnicity              wealth_index current_contraceptive_use 
##                      1148                       524                       885 
##        awareness_hiv_aids      used_computer_tablet             used_internet 
##                       541                       532                      2501 
##         owns_mobile_phone           access_to_media 
##                       529                       529
# Make a copy of female_df for imputation
imputed_df <- female_df
# Handle missing values in 'Age at First Marriage'

# Calculate the median value for 'Age at First Marriage', excluding NA values
median_age_first_marriage <- median(imputed_df$age_first_marriage, na.rm = TRUE)

# Impute missing values in 'Age at First Marriage' with the median value
imputed_df$age_first_marriage[is.na(imputed_df$age_first_marriage)] <- median_age_first_marriage
# Define a function to calculate mode for categorical variables
getMode <- function(v) {
  # The mode is the value that appears most frequently in the data
  uniqv <- unique(na.omit(v))  # Omit NA values and get unique values
  uniqv[which.max(tabulate(match(v, uniqv)))]  # Return the value with the highest frequency
}
# For 'ethnicity', an ordinal variable with predefined categories, it makes sense to impute missing values with the mode.
mode_ethnicity <- getMode(imputed_df$ethnicity)
imputed_df <- mutate(imputed_df, ethnicity = ifelse(is.na(ethnicity), mode_ethnicity, ethnicity))
mode_area <- getMode(imputed_df$area)
imputed_df <- mutate(imputed_df, area = ifelse(is.na(area), mode_area, area))
mode_region <- getMode(imputed_df$region)
imputed_df <- mutate(imputed_df, region = ifelse(is.na(region), mode_region, region))
mode_marital_status <- getMode(imputed_df$marital_status)
imputed_df <- mutate(imputed_df, marital_status = ifelse(is.na(marital_status), mode_marital_status, marital_status))


# Binary variables like 'current_contraceptive_use', 'health_insurance','awareness_hiv_aids', 'used_internet', 'used_computer_tablet', 'owns_mobile_phone', and 'access_to_media'
# should be imputed with the mode since it represents the most frequent category (either 0 or 1).

# Calculate the mode for each binary variable
mode_used_internet <- getMode(imputed_df$used_internet)
mode_current_contraceptive_use <- getMode(imputed_df$current_contraceptive_use)
mode_health_insurance <- getMode(imputed_df$health_insurance)
mode_awareness_hiv_aids <- getMode(imputed_df$awareness_hiv_aids)
mode_used_computer_tablet <- getMode(imputed_df$used_computer_tablet)
mode_owns_mobile_phone <- getMode(imputed_df$owns_mobile_phone)
mode_access_to_media <- getMode(imputed_df$access_to_media)

# Impute missing values for binary variables
imputed_df <- mutate(imputed_df,
  used_internet = ifelse(is.na(used_internet), mode_used_internet, used_internet),
  current_contraceptive_use = ifelse(is.na(current_contraceptive_use), mode_current_contraceptive_use, current_contraceptive_use),
  health_insurance = ifelse(is.na(health_insurance), mode_health_insurance, health_insurance),
  awareness_hiv_aids = ifelse(is.na(awareness_hiv_aids), mode_awareness_hiv_aids, awareness_hiv_aids),
  used_computer_tablet = ifelse(is.na(used_computer_tablet), mode_used_computer_tablet, used_computer_tablet),
  owns_mobile_phone = ifelse(is.na(owns_mobile_phone), mode_owns_mobile_phone, owns_mobile_phone),
  access_to_media = ifelse(is.na(access_to_media), mode_access_to_media, access_to_media)
)

# 'education_level' is an ordinal variable where the median could be a more suitable measure of central tendency than the mode.
# However, given the categorical nature of the levels (e.g., "Primary", "Secondary"), using the mode may still be appropriate.
mode_education_level <- getMode(imputed_df$education_level)
imputed_df <- mutate(imputed_df, education_level = ifelse(is.na(education_level), mode_education_level, education_level))

# 'wealth_index' is an ordinal variable where the median could be a more suitable measure of central tendency than the mode.
# However, given the categorical nature of the levels (e.g., "Poorest", "Poor",...), using the mode may still be appropriate.
mode_wealth_index <- getMode(imputed_df$wealth_index)
imputed_df <- mutate(imputed_df, wealth_index = ifelse(is.na(wealth_index), mode_wealth_index, wealth_index))
# Check the resulting dataset to confirm changes
summary(imputed_df)
##  age_first_marriage marital_status       area           region     
##  Min.   :10.00      Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:18.00      1st Qu.:1.000   1st Qu.:1.000   1st Qu.:2.000  
##  Median :20.00      Median :1.000   Median :2.000   Median :3.000  
##  Mean   :20.76      Mean   :1.408   Mean   :1.684   Mean   :3.403  
##  3rd Qu.:23.00      3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:5.000  
##  Max.   :47.00      Max.   :3.000   Max.   :2.000   Max.   :6.000  
##  education_level health_insurance   ethnicity      wealth_index 
##  Min.   :0.000   Min.   :0.0000   Min.   :1.000   Min.   :1.00  
##  1st Qu.:1.000   1st Qu.:1.0000   1st Qu.:1.000   1st Qu.:1.00  
##  Median :2.000   Median :1.0000   Median :1.000   Median :2.00  
##  Mean   :2.438   Mean   :0.8721   Mean   :1.527   Mean   :2.54  
##  3rd Qu.:3.000   3rd Qu.:1.0000   3rd Qu.:2.000   3rd Qu.:4.00  
##  Max.   :5.000   Max.   :1.0000   Max.   :4.000   Max.   :5.00  
##  current_contraceptive_use awareness_hiv_aids used_computer_tablet
##  Min.   :0.0000            Min.   :0.0000     Min.   :0.0000      
##  1st Qu.:0.0000            1st Qu.:1.0000     1st Qu.:0.0000      
##  Median :1.0000            Median :1.0000     Median :0.0000      
##  Mean   :0.6168            Mean   :0.8072     Mean   :0.3251      
##  3rd Qu.:1.0000            3rd Qu.:1.0000     3rd Qu.:1.0000      
##  Max.   :1.0000            Max.   :1.0000     Max.   :1.0000      
##  used_internet    owns_mobile_phone access_to_media 
##  Min.   :0.0000   Min.   :0.0000    Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:1.0000    1st Qu.:1.0000  
##  Median :1.0000   Median :1.0000    Median :1.0000  
##  Mean   :0.7192   Mean   :0.8886    Mean   :0.9071  
##  3rd Qu.:1.0000   3rd Qu.:1.0000    3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000    Max.   :1.0000
# After imputation, let's check for missing values
count_imputation <- sapply(imputed_df, function(x) sum(is.na(x)))
print(count_imputation)
##        age_first_marriage            marital_status                      area 
##                         0                         0                         0 
##                    region           education_level          health_insurance 
##                         0                         0                         0 
##                 ethnicity              wealth_index current_contraceptive_use 
##                         0                         0                         0 
##        awareness_hiv_aids      used_computer_tablet             used_internet 
##                         0                         0                         0 
##         owns_mobile_phone           access_to_media 
##                         0                         0

Visualization Comparing Before vs. After Imputation

# Histogram for 'age_first_marriage' before imputation
afm_0 <- ggplot(female_df, aes(x = age_first_marriage)) + 
  geom_histogram(fill = "lightpink", color = "black", bins = 30) +
  theme_light() +
  ggtitle("Age at First Marriage Before Imputation") +
  xlab("Age at First Marriage") +
  ylab("Frequency")

# Histogram for 'age_first_marriage' after imputation
afm_imputed <- ggplot(imputed_df, aes(x = age_first_marriage)) + 
  geom_histogram(fill = "plum", color = "black", bins = 30) +
  theme_light() +
  ggtitle("Age at First Marriage After Imputation") +
  xlab("Age at First Marriage") +
  ylab("Frequency")

# Arrange the two plots side by side
grid.arrange(afm_0, afm_imputed, ncol = 2)
## Warning: Removed 2026 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Arrange the two plots side by side and capture the layout as a grob
combined_plots <- arrangeGrob(afm_0, afm_imputed, ncol = 2)
## Warning: Removed 2026 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Now, use ggsave to save the combined plot
#ggsave("combined_age_first_marriage.png", plot = combined_plots, width = 10, height = 5)

Creating Binary Variables for Child Marriage Under 18 and 16

# Convert "age at first marriage" into a binary variable to indicate child marriage
# Child marriage is defined as marriage before the age of 18
# The new binary variable "child_marriage" will have a value of 1 if the marriage occurred before age 18, and 0 otherwise
imputed_df <- imputed_df %>%
  mutate(child_marriage = ifelse(age_first_marriage < 18, 1, 0))
# Create a binary variable for child marriage under 16
# The new variable "child_marriage_u16" will have a value of 1 if the marriage occurred before age 16, and 0 otherwise
imputed_df <- imputed_df %>%
  mutate(child_marriage_u16 = ifelse(age_first_marriage < 16, 1, 0))
# Move "child_marriage" and "child_marriage_u16" to the front of the dataframe
imputed_df <- imputed_df %>%
  select(child_marriage, child_marriage_u16, everything())
# Exporting female_df to a CSV file in the current working directory
write.csv(imputed_df, "imputed_df.csv", row.names = FALSE)

EDA

# Calculate total observations
total_obs <- nrow(imputed_df)

# Aggregate counts for each category by region without altering the original 'region' field
counts_df <- aggregate(cbind(ever_married = imputed_df$marital_status %in% c(1, 2), 
                             married_u18 = imputed_df$child_marriage == 1, 
                             married_u16 = imputed_df$child_marriage_u16 == 1) ~ region, 
                       data = imputed_df, 
                       FUN = sum)

# Convert counts to percentages
counts_df$ever_married <- (counts_df$ever_married / total_obs) * 100
counts_df$married_u18 <- (counts_df$married_u18 / total_obs) * 100
counts_df$married_u16 <- (counts_df$married_u16 / total_obs) * 100


# Creating the plot with ggplot2, directly labeling regions
p <- ggplot(counts_df, aes(x = factor(region))) +
  geom_bar(aes(y = ever_married, fill = "Ever married or cohabited"), stat = "identity") +
  geom_bar(aes(y = married_u18, fill = "Married or cohabited before 18 years old"), stat = "identity") +
  geom_bar(aes(y = married_u16, fill = "Married or cohabited before 16 years old"), stat = "identity") +
  scale_fill_manual(values = c("Ever married or cohabited" = "snow2", 
                               "Married or cohabited before 18 years old" = "grey", 
                               "Married or cohabited before 16 years old" = "black"),
                    name = "Marital Status") +
  labs(x = "Region", y = "Percentage", title = "Prevalence of Child Marriage by Regions Among Females") +
  theme_minimal() +
  theme(plot.margin = margin(t = 10, r = 10, b = 10, l = 10, unit = "mm"), 
        axis.text.x = element_text(angle = 45, hjust = 1), 
        legend.position = "right") +
  scale_x_discrete(labels = c("1" = "Red River Delta", "2" = "Northern Midlands And Mountain", 
                              "3" = "North Central And Central Coastal", "4" = "Central Highlands", 
                              "5" = "South East", "6" = "Mekong River Delta"))

# Convert ggplot object to plotly for interactive visualization
ggplotly(p)

Preparing for Regression Analysis

Correlation Matrix

# Calculate correlation matrix
cor_matrix <- cor(imputed_df %>% select_if(is.numeric), use = "complete.obs")

# Melt the correlation matrix
melted_cor_matrix <- melt(cor_matrix)
# Generate an interactive heatmap
corr_matrix <- ggplot(melted_cor_matrix, aes(Var1, Var2, fill = value)) +
    geom_tile() +
    scale_fill_gradientn(
        colours = c("deepskyblue", "white", "red2"),
        values = scales::rescale(c(-1, 0, 1)),
        limits = c(-1, 1),
        name="Pearson\nCorrelation"
    ) +
    theme_minimal() + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    xlab("") + 
    ylab("") +
    ggtitle("Correlation Matrix") 

# Convert ggplot object to plotly for interactivity
ggplotly(corr_matrix)

Converting Categorical and Binary Variables to Factors

# # Mapping for 'area'
# area_labels <- c("Urban", "Rural")
# names(area_labels) <- c("1", "2")
# 
# # Mapping for 'region'
# region_labels <- c("Red River Delta", "Northern Midlands And Mountain", "North Central And Central Coastal", "Central Highlands", "South East", "Mekong River Delta")
# names(region_labels) <- c("1", "2", "3", "4", "5", "6")
# 
# # Mapping for 'education_level'
# education_level_labels <- c("None or Pre-Primary", "Primary", "Lower Primary", "Upper Primary", "Vocational High School", "University/College/Higher")
# names(education_level_labels) <- c("0", "1", "2", "3", "4", "5") 
# 
# # Mapping for 'ethnicity'
# ethnicity_labels <- c("Kinh and Hoa", "Tay, Thai, Muong, Nung", "Khmer", "Mong")  # Add or modify as per your dataset
# names(ethnicity_labels) <- c("1", "2", "3", "4")  
# 
# # Mapping for 'wealth_index'
# wealth_index_labels <- c("Poorest", "Poor", "Middle", "Rich", "Richest")
# names(wealth_index_labels) <- c("1", "2", "3", "4", "5") 
# Convert nominal and ordinal variables to factors
imputed_df$area <- as.factor(imputed_df$area)
imputed_df$region <- as.factor(imputed_df$region)
imputed_df$education_level <- factor(imputed_df$education_level, ordered = FALSE)
imputed_df$ethnicity <- as.factor(imputed_df$ethnicity)
imputed_df$wealth_index <- factor(imputed_df$wealth_index, ordered = FALSE)

# Binary variables are already in the correct format and can be used as is

Base Logistic Regression Model

# Baseline model for reference
baseline_model <- glm(child_marriage ~ area + education_level + wealth_index + health_insurance + current_contraceptive_use + awareness_hiv_aids + access_to_media, family = binomial(), data = imputed_df)

# Summarize the baseline model
summary(baseline_model)
## 
## Call:
## glm(formula = child_marriage ~ area + education_level + wealth_index + 
##     health_insurance + current_contraceptive_use + awareness_hiv_aids + 
##     access_to_media, family = binomial(), data = imputed_df)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -0.73391    0.13169  -5.573 2.50e-08 ***
## area2                      0.45543    0.07999   5.693 1.25e-08 ***
## education_level1          -0.41350    0.08705  -4.750 2.03e-06 ***
## education_level2          -0.58693    0.08518  -6.890 5.57e-12 ***
## education_level3          -1.52807    0.11502 -13.286  < 2e-16 ***
## education_level4          -3.24812    0.51198  -6.344 2.24e-10 ***
## education_level5          -3.19482    0.25323 -12.616  < 2e-16 ***
## wealth_index2             -0.46183    0.07994  -5.777 7.61e-09 ***
## wealth_index3             -0.80552    0.09964  -8.085 6.24e-16 ***
## wealth_index4             -0.61389    0.10926  -5.618 1.93e-08 ***
## wealth_index5             -0.89341    0.14898  -5.997 2.01e-09 ***
## health_insurance           0.06907    0.08094   0.853    0.393    
## current_contraceptive_use  0.40123    0.06065   6.615 3.71e-11 ***
## awareness_hiv_aids        -0.47875    0.06960  -6.879 6.05e-12 ***
## access_to_media           -0.01461    0.08121  -0.180    0.857    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10439.1  on 11293  degrees of freedom
## Residual deviance:  8493.7  on 11279  degrees of freedom
## AIC: 8523.7
## 
## Number of Fisher Scoring iterations: 7

Accessing Base Model’s Multicollinearity

# Calculate VIF (A VIF value > 5 indicates high multicollinearity)
base_vif_results <- vif(baseline_model)
print(base_vif_results)
##                               GVIF Df GVIF^(1/(2*Df))
## area                      1.110732  1        1.053913
## education_level           1.707863  5        1.054983
## wealth_index              1.401192  4        1.043067
## health_insurance          1.021120  1        1.010505
## current_contraceptive_use 1.034853  1        1.017277
## awareness_hiv_aids        1.500511  1        1.224954
## access_to_media           1.268704  1        1.126367
# Check if any VIF value is greater than a typical threshold, like 5 or 10.
base_high_vif <- base_vif_results[base_vif_results > 5]
print(base_high_vif)
## numeric(0)

Adding Fixed Effects to Base Model

# Enhanced model with additional fixed effects
enhanced_model <- glm(child_marriage ~ area + region + education_level + ethnicity + wealth_index + health_insurance + current_contraceptive_use + awareness_hiv_aids + access_to_media, 
                      family = binomial(), 
                      data = imputed_df)

# Summarize the new model with FEs
summary(enhanced_model)
## 
## Call:
## glm(formula = child_marriage ~ area + region + education_level + 
##     ethnicity + wealth_index + health_insurance + current_contraceptive_use + 
##     awareness_hiv_aids + access_to_media, family = binomial(), 
##     data = imputed_df)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -1.54522    0.17906  -8.630  < 2e-16 ***
## area2                      0.28689    0.08488   3.380 0.000725 ***
## region2                    0.37758    0.12919   2.923 0.003469 ** 
## region3                    0.18170    0.12745   1.426 0.153967    
## region4                    0.34057    0.12599   2.703 0.006869 ** 
## region5                   -0.05340    0.12612  -0.423 0.672028    
## region6                   -0.05277    0.13336  -0.396 0.692301    
## education_level1          -0.05925    0.09353  -0.634 0.526382    
## education_level2          -0.22571    0.09227  -2.446 0.014438 *  
## education_level3          -1.17196    0.12172  -9.629  < 2e-16 ***
## education_level4          -2.96956    0.51385  -5.779 7.51e-09 ***
## education_level5          -2.89574    0.25599 -11.312  < 2e-16 ***
## ethnicity2                 0.07771    0.10592   0.734 0.463184    
## ethnicity3                 0.27807    0.12830   2.167 0.030210 *  
## ethnicity4                 1.15504    0.10626  10.870  < 2e-16 ***
## wealth_index2             -0.12498    0.08523  -1.466 0.142533    
## wealth_index3             -0.46330    0.10565  -4.385 1.16e-05 ***
## wealth_index4             -0.26442    0.11695  -2.261 0.023760 *  
## wealth_index5             -0.54064    0.16030  -3.373 0.000744 ***
## health_insurance          -0.05157    0.08277  -0.623 0.533244    
## current_contraceptive_use  0.48480    0.06265   7.739 1.00e-14 ***
## awareness_hiv_aids        -0.19387    0.07592  -2.554 0.010661 *  
## access_to_media           -0.07880    0.08517  -0.925 0.354845    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10439.1  on 11293  degrees of freedom
## Residual deviance:  8240.2  on 11271  degrees of freedom
## AIC: 8286.2
## 
## Number of Fisher Scoring iterations: 7

Model with FEs’ Multicollinearity

# Calculate VIF (A VIF value > 5 indicates high multicollinearity)
FE_vif_results <- vif(enhanced_model)
print(FE_vif_results)
##                               GVIF Df GVIF^(1/(2*Df))
## area                      1.254248  1        1.119932
## region                    4.916245  5        1.172636
## education_level           1.956030  5        1.069394
## ethnicity                 4.235677  3        1.272000
## wealth_index              1.934925  4        1.086008
## health_insurance          1.051524  1        1.025438
## current_contraceptive_use 1.049061  1        1.024237
## awareness_hiv_aids        1.679129  1        1.295812
## access_to_media           1.312005  1        1.145428
# Check if any VIF value is greater than a typical threshold, like 5.
FE_high_vif <- FE_vif_results[FE_vif_results > 5]
print(FE_high_vif)
## numeric(0)

No VIF is significant larger than 5. This suggests that the independent variables in my model with Fixed Effects do not suffer from severe multicollinearity.

Assessing AICs and BICs of Base Model vs Fixed Effects Model

# Model comparison using AIC
aic_base <- AIC(baseline_model)
aic_enhanced <- AIC(enhanced_model)
cat("AIC - Base Model:", aic_base, "\n")
## AIC - Base Model: 8523.675
cat("AIC - Enhanced Model:", aic_enhanced, "\n")
## AIC - Enhanced Model: 8286.168

Additionally, the lower AIC for the model with two fixed effects compared to the baseline model suggests that adding these fixed effects improves the model’s overall fit. This aligns with my theoretical justification for including region and ethnicity as relevant factors in predicting child marriage.

# Model comparison using BIC
bic_base <- BIC(baseline_model)
bic_enhanced <- BIC(enhanced_model)
cat("BIC - Base Model:", bic_base, "\n")
## BIC - Base Model: 8633.655
cat("BIC - Enhanced Model:", bic_enhanced, "\n")
## BIC - Enhanced Model: 8454.805

Lower BIC values suggest that, despite the added complexity (more parameters), the model with both region and ethnicity provides a better overall fit to my data when adjusted for the number of predictors. This is a strong indication that these variables are meaningful in explaining the variance in child marriage occurrences in my dataset.

Model with Interaction Terms (Area:Wealth_Index)

# Adding the interaction term between education level and wealth index
model_interaction <- update(enhanced_model, . ~ . + area:wealth_index)
summary(model_interaction)
## 
## Call:
## glm(formula = child_marriage ~ area + region + education_level + 
##     ethnicity + wealth_index + health_insurance + current_contraceptive_use + 
##     awareness_hiv_aids + access_to_media + area:wealth_index, 
##     family = binomial(), data = imputed_df)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -1.78525    0.21262  -8.396  < 2e-16 ***
## area2                      0.54558    0.15257   3.576 0.000349 ***
## region2                    0.36523    0.13009   2.807 0.004994 ** 
## region3                    0.16778    0.12857   1.305 0.191877    
## region4                    0.32337    0.12701   2.546 0.010892 *  
## region5                   -0.06803    0.12726  -0.535 0.592956    
## region6                   -0.06059    0.13410  -0.452 0.651371    
## education_level1          -0.06117    0.09352  -0.654 0.513085    
## education_level2          -0.21895    0.09231  -2.372 0.017701 *  
## education_level3          -1.17282    0.12176  -9.632  < 2e-16 ***
## education_level4          -2.96880    0.51416  -5.774 7.74e-09 ***
## education_level5          -2.89550    0.25820 -11.214  < 2e-16 ***
## ethnicity2                 0.06171    0.10624   0.581 0.561352    
## ethnicity3                 0.27262    0.12824   2.126 0.033518 *  
## ethnicity4                 1.12852    0.10687  10.559  < 2e-16 ***
## wealth_index2              0.26715    0.19445   1.374 0.169473    
## wealth_index3             -0.15465    0.21122  -0.732 0.464075    
## wealth_index4             -0.04152    0.22195  -0.187 0.851622    
## wealth_index5             -0.32837    0.26777  -1.226 0.220078    
## health_insurance          -0.04238    0.08289  -0.511 0.609164    
## current_contraceptive_use  0.49215    0.06274   7.844 4.35e-15 ***
## awareness_hiv_aids        -0.18696    0.07599  -2.460 0.013883 *  
## access_to_media           -0.07199    0.08522  -0.845 0.398196    
## area2:wealth_index2       -0.47937    0.21523  -2.227 0.025928 *  
## area2:wealth_index3       -0.38304    0.24189  -1.584 0.113299    
## area2:wealth_index4       -0.26224    0.25631  -1.023 0.306247    
## area2:wealth_index5       -0.25049    0.31993  -0.783 0.433646    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10439.1  on 11293  degrees of freedom
## Residual deviance:  8234.7  on 11267  degrees of freedom
## AIC: 8288.7
## 
## Number of Fisher Scoring iterations: 7
cat("AIC (base):", AIC(baseline_model), "\nBIC (base):", BIC(baseline_model), "\n")
## AIC (base): 8523.675 
## BIC (base): 8633.655
cat("AIC (w/ interaction terms):", AIC(model_interaction), "\nBIC (w/ interaction terms):", BIC(model_interaction), "\n")
## AIC (w/ interaction terms): 8288.662 
## BIC (w/ interaction terms): 8486.627

Logistic Regression Models Comparison (Odd Ratios and 95% Confidence Intervals)

# Function to add significance asterisks
add_asterisks <- function(p_value) {
  if (is.na(p_value)) {
    return(NA)
  } else if (p_value < 0.001) {
    return("***")
  } else if (p_value < 0.01) {
    return("**")
  } else if (p_value < 0.05) {
    return("*")
  } else {
    return("")
  }
}
# Function to format confidence intervals as a string
format_ci <- function(lower, upper) {
  paste0("(", round(lower, 2), ", ", round(upper, 2), ")")
}
# Tidy the baseline model with confidence intervals
tidy_baseline <- tidy(baseline_model, conf.int = TRUE, exponentiate = TRUE)

# Tidy the enhanced model with confidence intervals
tidy_enhanced <- tidy(enhanced_model, conf.int = TRUE, exponentiate = TRUE)

# Tidy the interaction model with confidence intervals
tidy_interaction <- tidy(model_interaction, conf.int = TRUE, exponentiate = TRUE)
# Apply the function to each model's p.value
tidy_baseline$asterisks <- sapply(tidy_baseline$p.value, add_asterisks)
tidy_enhanced$asterisks <- sapply(tidy_enhanced$p.value, add_asterisks)
tidy_interaction$asterisks <- sapply(tidy_interaction$p.value, add_asterisks)
# Create OR strings with asterisks and format CIs as a string
tidy_baseline <- tidy_baseline %>%
  mutate(
    OR = ifelse(is.na(estimate), NA, paste0(round(estimate, 2), asterisks)),
    CI = ifelse(is.na(conf.low) | is.na(conf.high), NA, format_ci(conf.low, conf.high))
  )

tidy_enhanced <- tidy_enhanced %>%
  mutate(
    OR = ifelse(is.na(estimate), NA, paste0(round(estimate, 2), asterisks)),
    CI = ifelse(is.na(conf.low) | is.na(conf.high), NA, format_ci(conf.low, conf.high))
  )

tidy_interaction <- tidy_interaction %>%
  mutate(
    OR = ifelse(is.na(estimate), NA, paste0(round(estimate, 2), asterisks)),
    CI = ifelse(is.na(conf.low) | is.na(conf.high), NA, format_ci(conf.low, conf.high))
  )
# Add a 'Model' column to each tidied dataframe
tidy_baseline <- tidy_baseline %>% mutate(Model = "Baseline")
tidy_enhanced <- tidy_enhanced %>% mutate(Model = "Enhanced")
tidy_interaction <- tidy_interaction %>% mutate(Model = "Interaction")

# Combine and pivot the dataframes
combined_results <- bind_rows(
  tidy_baseline %>% select(term, OR, CI, Model),
  tidy_enhanced %>% select(term, OR, CI, Model),
  tidy_interaction %>% select(term, OR, CI, Model)
) %>%
  pivot_wider(names_from = Model, values_from = c(OR, CI))
# Replace NAs with "—"
combined_results[is.na(combined_results)] <- "—"
# Reordering columns to have OR and CI next to each other for each model
combined_results <- combined_results %>%
  select(term, 
         OR_Baseline, CI_Baseline, 
         OR_Enhanced, CI_Enhanced, 
         OR_Interaction, CI_Interaction)
# Print the final combined table
print(combined_results)
## # A tibble: 27 × 7
##    term           OR_Baseline CI_Baseline OR_Enhanced CI_Enhanced OR_Interaction
##    <chr>          <chr>       <chr>       <chr>       <chr>       <chr>         
##  1 (Intercept)    0.48***     (0.37, 0.6… 0.21***     (0.15, 0.3) 0.17***       
##  2 area2          1.58***     (1.35, 1.8… 1.33***     (1.13, 1.5… 1.73***       
##  3 education_lev… 0.66***     (0.56, 0.7… 0.94        (0.78, 1.1… 0.94          
##  4 education_lev… 0.56***     (0.47, 0.6… 0.8*        (0.67, 0.9… 0.8*          
##  5 education_lev… 0.22***     (0.17, 0.2… 0.31***     (0.24, 0.3… 0.31***       
##  6 education_lev… 0.04***     (0.01, 0.0… 0.05***     (0.02, 0.1… 0.05***       
##  7 education_lev… 0.04***     (0.02, 0.0… 0.06***     (0.03, 0.0… 0.06***       
##  8 wealth_index2  0.63***     (0.54, 0.7… 0.88        (0.75, 1.0… 1.31          
##  9 wealth_index3  0.45***     (0.37, 0.5… 0.63***     (0.51, 0.7… 0.86          
## 10 wealth_index4  0.54***     (0.44, 0.6… 0.77*       (0.61, 0.9… 0.96          
## # ℹ 17 more rows
## # ℹ 1 more variable: CI_Interaction <chr>